### Read the data and split into train and test sets
ucars <- read.csv("~/Downloads/usedcars.csv")
head(ucars)

# Fix the RNG seed so the train/test partition is reproducible.
set.seed(30005)

# Draw 72 row indices without replacement for training; all remaining rows
# form the test set. seq_len() is used instead of 1:nrow() so that a
# zero-row data frame gives an empty index vector rather than c(1, 0).
trainRec <- sort(sample(seq_len(nrow(ucars)), size = 72))
testRec <- setdiff(seq_len(nrow(ucars)), trainRec)
traincars <- ucars[trainRec, ]
testcars <- ucars[testRec, ]

# Introduce a variable for the car's age, measured against the year 2025.
traincars$Age <- 2025 - traincars$Year
testcars$Age <- 2025 - testcars$Year

# For readability, introduce a variable holding the price in units of 100,000.
traincars$price_in_100000 <- traincars$Price / 100000
testcars$price_in_100000 <- testcars$Price / 100000
# Relationship between price and the numeric variables
# (Kilometers_Driven, Mileage, Engine, Power): one scatter plot per variable,
# laid out on a 2 x 2 grid.
par(mfrow = c(2, 2))
numeric_vars <- c("Kilometers_Driven", "Mileage", "Engine", "Power")
for (var in numeric_vars) {
  plot(
    x = traincars[[var]],
    y = traincars$price_in_100000,
    xlab = var,
    ylab = "Price in 100,000s",
    main = paste("Price vs", var),
    pch = 19,
    col = "steelblue"
  )
}
### Relationship between price and categorical variables (boxplots):
### Fuel_Type, Transmission and Owner_Type, stacked in three rows.
par(mfrow = c(3, 1))
boxplot(
  price_in_100000 ~ Fuel_Type,
  data = traincars,
  main = "Price by Fuel Type",
  col = "lightblue"
)
boxplot(
  price_in_100000 ~ Transmission,
  data = traincars,
  main = "Price by Transmission",
  col = "lightgreen"
)
boxplot(
  price_in_100000 ~ Owner_Type,
  data = traincars,
  main = "Price by Owner Type",
  col = "lightpink"
)
### Relationship between price and the remaining grouping variables
### (boxplots): Brand, Year and Seats, stacked in three rows.
par(mfrow = c(3, 1))
boxplot(
  price_in_100000 ~ Brand,
  data = traincars,
  main = "Price by Brand",
  col = "lightgoldenrod"
)
boxplot(
  price_in_100000 ~ Year,
  data = traincars,
  main = "Price by Year",
  col = "palegreen"
)
boxplot(
  price_in_100000 ~ Seats,
  data = traincars,
  main = "Price by number of seats",
  col = "yellow"
)
### (3) Recode Owner_Type as a factor whose levels First/Second/Third are
### relabelled 1/2/3. Only Owner_Type is recoded here.
# The identical encoding is applied to the test set so that any model that
# includes Owner_Type sees the same factor levels at prediction time.
# NOTE(review): values other than "First", "Second" or "Third" become NA under
# this recode; confirm the data contains no other owner categories.
traincars$Owner_Type <- factor(
  traincars$Owner_Type,
  levels = c("First", "Second", "Third"),
  labels = c(1, 2, 3)
)
testcars$Owner_Type <- factor(
  testcars$Owner_Type,
  levels = c("First", "Second", "Third"),
  labels = c(1, 2, 3)
)
### Choosing predictors with the AIC stepwise method
# Lower bound of the search: the intercept-only model.
nullModel <- lm(Price ~ 1, data = traincars)
# Upper bound of the search: every candidate predictor.
fullModel <- lm(
  Price ~ Brand + Age + Kilometers_Driven + Fuel_Type + Transmission +
    Owner_Type + Mileage + Engine + Power + Seats,
  data = traincars
)
# Start from the null model; terms may be added or dropped at each step.
step(
  nullModel,
  scope = list(lower = nullModel, upper = fullModel),
  direction = "both"
)
Start: AIC=1990.5
Price ~ 1
Df Sum of Sq RSS AIC
+ Power 1 5.1251e+13 1.9823e+13 1900.6
+ Brand 10 5.2312e+13 1.8762e+13 1914.6
+ Engine 1 3.5792e+13 3.5282e+13 1942.1
+ Transmission 1 3.5181e+13 3.5892e+13 1943.3
+ Mileage 1 2.7868e+13 4.3206e+13 1956.7
+ Owner_Type 2 1.0999e+13 6.0075e+13 1982.4
+ Age 1 5.5270e+12 6.5547e+13 1986.7
<none> 7.1074e+13 1990.5
+ Fuel_Type 1 6.2500e+11 7.0449e+13 1991.9
+ Seats 1 1.3638e+10 7.1060e+13 1992.5
+ Kilometers_Driven 1 6.0844e+09 7.1068e+13 1992.5
Step: AIC=1900.57
Price ~ Power
Df Sum of Sq RSS AIC
+ Brand 10 1.4829e+13 4.9940e+12 1821.3
+ Transmission 1 4.6105e+12 1.5213e+13 1883.5
+ Fuel_Type 1 1.7123e+12 1.8111e+13 1896.1
+ Seats 1 1.2818e+12 1.8541e+13 1897.8
<none> 1.9823e+13 1900.6
+ Owner_Type 2 9.7582e+11 1.8847e+13 1900.9
+ Mileage 1 3.9791e+11 1.9425e+13 1901.1
+ Age 1 1.4236e+11 1.9681e+13 1902.0
+ Engine 1 9.9061e+10 1.9724e+13 1902.2
+ Kilometers_Driven 1 2.7399e+09 1.9820e+13 1902.6
- Power 1 5.1251e+13 7.1074e+13 1990.5
Step: AIC=1821.31
Price ~ Power + Brand
Df Sum of Sq RSS AIC
+ Fuel_Type 1 9.4813e+11 4.0459e+12 1808.2
+ Seats 1 6.1340e+11 4.3806e+12 1813.9
+ Transmission 1 4.6484e+11 4.5292e+12 1816.3
+ Mileage 1 2.4633e+11 4.7477e+12 1819.7
+ Engine 1 1.5615e+11 4.8379e+12 1821.0
<none> 4.9940e+12 1821.3
+ Kilometers_Driven 1 7.5671e+10 4.9183e+12 1822.2
+ Age 1 3.0485e+10 4.9635e+12 1822.9
+ Owner_Type 2 1.1992e+11 4.8741e+12 1823.6
- Brand 10 1.4829e+13 1.9823e+13 1900.6
- Power 1 1.3768e+13 1.8762e+13 1914.6
Step: AIC=1808.15
Price ~ Power + Brand + Fuel_Type
Df Sum of Sq RSS AIC
+ Mileage 1 2.6867e+11 3.7772e+12 1805.2
+ Transmission 1 2.3113e+11 3.8148e+12 1805.9
+ Seats 1 1.9533e+11 3.8505e+12 1806.6
<none> 4.0459e+12 1808.2
+ Age 1 3.4376e+10 4.0115e+12 1809.5
+ Kilometers_Driven 1 1.5652e+10 4.0302e+12 1809.9
+ Engine 1 1.2654e+09 4.0446e+12 1810.1
+ Owner_Type 2 3.6448e+10 4.0094e+12 1811.5
- Fuel_Type 1 9.4813e+11 4.9940e+12 1821.3
- Brand 10 1.4065e+13 1.8111e+13 1896.1
- Power 1 1.4380e+13 1.8426e+13 1915.3
Step: AIC=1805.2
Price ~ Power + Brand + Fuel_Type + Mileage
Df Sum of Sq RSS AIC
+ Transmission 1 3.2901e+11 3.4482e+12 1800.6
+ Kilometers_Driven 1 1.1473e+11 3.6625e+12 1805.0
<none> 3.7772e+12 1805.2
+ Age 1 9.9348e+10 3.6779e+12 1805.3
+ Seats 1 5.4295e+10 3.7229e+12 1806.2
+ Engine 1 4.1425e+10 3.7358e+12 1806.4
- Mileage 1 2.6867e+11 4.0459e+12 1808.2
+ Owner_Type 2 1.4600e+10 3.7626e+12 1808.9
- Fuel_Type 1 9.7046e+11 4.7477e+12 1819.7
- Power 1 7.9986e+12 1.1776e+13 1885.1
- Brand 10 1.3729e+13 1.7507e+13 1895.6
Step: AIC=1800.64
Price ~ Power + Brand + Fuel_Type + Mileage + Transmission
Df Sum of Sq RSS AIC
+ Kilometers_Driven 1 1.0769e+11 3.3405e+12 1800.3
+ Seats 1 1.0459e+11 3.3436e+12 1800.4
<none> 3.4482e+12 1800.6
+ Age 1 5.9766e+10 3.3884e+12 1801.4
+ Engine 1 2.3657e+10 3.4245e+12 1802.1
+ Owner_Type 2 3.2029e+10 3.4162e+12 1804.0
- Transmission 1 3.2901e+11 3.7772e+12 1805.2
- Mileage 1 3.6655e+11 3.8148e+12 1805.9
- Fuel_Type 1 6.9992e+11 4.1481e+12 1811.9
- Power 1 4.5691e+12 8.0173e+12 1859.4
- Brand 10 9.9217e+12 1.3370e+13 1878.2
Step: AIC=1800.35
Price ~ Power + Brand + Fuel_Type + Mileage + Transmission +
Kilometers_Driven
Df Sum of Sq RSS AIC
<none> 3.3405e+12 1800.3
+ Seats 1 8.0771e+10 3.2597e+12 1800.6
- Kilometers_Driven 1 1.0769e+11 3.4482e+12 1800.6
+ Engine 1 2.9204e+10 3.3113e+12 1801.7
+ Age 1 1.9443e+09 3.3386e+12 1802.3
+ Owner_Type 2 6.0638e+10 3.2799e+12 1803.0
- Transmission 1 3.2197e+11 3.6625e+12 1805.0
- Mileage 1 4.6644e+11 3.8070e+12 1807.8
- Fuel_Type 1 8.0407e+11 4.1446e+12 1813.9
- Power 1 4.4575e+12 7.7980e+12 1859.4
- Brand 10 9.2120e+12 1.2552e+13 1875.7
Call:
lm(formula = Price ~ Power + Brand + Fuel_Type + Mileage + Transmission +
Kilometers_Driven, data = traincars)
Coefficients:
(Intercept) Power BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
2.477e+06 5.657e+03 4.238e+04 -9.910e+05 -8.947e+05 -1.166e+06 -1.194e+06 -9.918e+05
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual Kilometers_Driven
2.926e+03 -8.977e+05 -7.333e+05 -8.508e+05 -2.632e+05 -4.236e+04 -2.207e+05 -6.367e+00
### Choosing predictors with the BIC stepwise method
# Same search as the AIC run, but with the penalty per parameter set to
# log(n). The trace printed by step() still labels the criterion "AIC".
step(
  nullModel,
  scope = list(lower = nullModel, upper = fullModel),
  direction = "both",
  k = log(nrow(traincars))
)
Start: AIC=1992.78
Price ~ 1
Df Sum of Sq RSS AIC
+ Power 1 5.1251e+13 1.9823e+13 1905.1
+ Brand 10 5.2312e+13 1.8762e+13 1939.7
+ Engine 1 3.5792e+13 3.5282e+13 1946.6
+ Transmission 1 3.5181e+13 3.5892e+13 1947.9
+ Mileage 1 2.7868e+13 4.3206e+13 1961.2
+ Owner_Type 2 1.0999e+13 6.0075e+13 1989.2
+ Age 1 5.5270e+12 6.5547e+13 1991.2
<none> 7.1074e+13 1992.8
+ Fuel_Type 1 6.2500e+11 7.0449e+13 1996.4
+ Seats 1 1.3638e+10 7.1060e+13 1997.0
+ Kilometers_Driven 1 6.0844e+09 7.1068e+13 1997.0
Step: AIC=1905.12
Price ~ Power
Df Sum of Sq RSS AIC
+ Brand 10 1.4829e+13 4.9940e+12 1848.6
+ Transmission 1 4.6105e+12 1.5213e+13 1890.3
+ Fuel_Type 1 1.7123e+12 1.8111e+13 1902.9
+ Seats 1 1.2818e+12 1.8541e+13 1904.6
<none> 1.9823e+13 1905.1
+ Mileage 1 3.9791e+11 1.9425e+13 1907.9
+ Age 1 1.4236e+11 1.9681e+13 1908.9
+ Engine 1 9.9061e+10 1.9724e+13 1909.0
+ Kilometers_Driven 1 2.7399e+09 1.9820e+13 1909.4
+ Owner_Type 2 9.7582e+11 1.8847e+13 1910.0
- Power 1 5.1251e+13 7.1074e+13 1992.8
Step: AIC=1848.63
Price ~ Power + Brand
Df Sum of Sq RSS AIC
+ Fuel_Type 1 9.4813e+11 4.0459e+12 1837.7
+ Seats 1 6.1340e+11 4.3806e+12 1843.5
+ Transmission 1 4.6484e+11 4.5292e+12 1845.9
<none> 4.9940e+12 1848.6
+ Mileage 1 2.4633e+11 4.7477e+12 1849.3
+ Engine 1 1.5615e+11 4.8379e+12 1850.6
+ Kilometers_Driven 1 7.5671e+10 4.9183e+12 1851.8
+ Age 1 3.0485e+10 4.9635e+12 1852.5
+ Owner_Type 2 1.1992e+11 4.8741e+12 1855.4
- Brand 10 1.4829e+13 1.9823e+13 1905.1
- Power 1 1.3768e+13 1.8762e+13 1939.7
Step: AIC=1837.74
Price ~ Power + Brand + Fuel_Type
Df Sum of Sq RSS AIC
+ Mileage 1 2.6867e+11 3.7772e+12 1837.1
<none> 4.0459e+12 1837.7
+ Transmission 1 2.3113e+11 3.8148e+12 1837.8
+ Seats 1 1.9533e+11 3.8505e+12 1838.5
+ Age 1 3.4376e+10 4.0115e+12 1841.4
+ Kilometers_Driven 1 1.5652e+10 4.0302e+12 1841.7
+ Engine 1 1.2654e+09 4.0446e+12 1842.0
+ Owner_Type 2 3.6448e+10 4.0094e+12 1845.7
- Fuel_Type 1 9.4813e+11 4.9940e+12 1848.6
- Brand 10 1.4065e+13 1.8111e+13 1902.9
- Power 1 1.4380e+13 1.8426e+13 1942.6
Step: AIC=1837.07
Price ~ Power + Brand + Fuel_Type + Mileage
Df Sum of Sq RSS AIC
+ Transmission 1 3.2901e+11 3.4482e+12 1834.8
<none> 3.7772e+12 1837.1
- Mileage 1 2.6867e+11 4.0459e+12 1837.7
+ Kilometers_Driven 1 1.1473e+11 3.6625e+12 1839.1
+ Age 1 9.9348e+10 3.6779e+12 1839.4
+ Seats 1 5.4295e+10 3.7229e+12 1840.3
+ Engine 1 4.1425e+10 3.7358e+12 1840.6
+ Owner_Type 2 1.4600e+10 3.7626e+12 1845.3
- Fuel_Type 1 9.7046e+11 4.7477e+12 1849.3
- Brand 10 1.3729e+13 1.7507e+13 1904.7
- Power 1 7.9986e+12 1.1776e+13 1914.7
Step: AIC=1834.79
Price ~ Power + Brand + Fuel_Type + Mileage + Transmission
Df Sum of Sq RSS AIC
<none> 3.4482e+12 1834.8
+ Kilometers_Driven 1 1.0769e+11 3.3405e+12 1836.8
+ Seats 1 1.0459e+11 3.3436e+12 1836.8
- Transmission 1 3.2901e+11 3.7772e+12 1837.1
- Mileage 1 3.6655e+11 3.8148e+12 1837.8
+ Age 1 5.9766e+10 3.3884e+12 1837.8
+ Engine 1 2.3657e+10 3.4245e+12 1838.6
+ Owner_Type 2 3.2029e+10 3.4162e+12 1842.7
- Fuel_Type 1 6.9992e+11 4.1481e+12 1843.8
- Brand 10 9.9217e+12 1.3370e+13 1889.6
- Power 1 4.5691e+12 8.0173e+12 1891.3
Call:
lm(formula = Price ~ Power + Brand + Fuel_Type + Mileage + Transmission,
data = traincars)
Coefficients:
(Intercept) Power BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
2134420 5715 83466 -969348 -935963 -1204552 -1167995 -965189
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual
33754 -901201 -755428 -845437 -222150 -35033 -223043
### Choosing predictors with the best-subset method
library(leaps)
# Keep the single best model of each size (nbest = 1). The summary below
# reports subsets of each size up to 8, and lists each factor dummy
# (e.g. BrandBMW) as its own candidate variable.
Bestfits <- regsubsets(
  Price ~ Brand + Age + Kilometers_Driven + Fuel_Type + Transmission +
    Owner_Type + Mileage + Engine + Power + Seats,
  data = traincars,
  nbest = 1
)
summary(Bestfits)
Subset selection object
Call: regsubsets.formula(Price ~ Brand + Age + Kilometers_Driven +
Fuel_Type + Transmission + Owner_Type + Mileage + Engine +
Power + Seats, data = traincars, nbest = 1)
20 Variables (and intercept)
Forced in Forced out
BrandBMW FALSE FALSE
BrandFord FALSE FALSE
BrandHonda FALSE FALSE
BrandHyundai FALSE FALSE
BrandMahindra FALSE FALSE
BrandMaruti FALSE FALSE
BrandMercedes FALSE FALSE
BrandTata FALSE FALSE
BrandToyota FALSE FALSE
BrandVolkswagen FALSE FALSE
Age FALSE FALSE
Kilometers_Driven FALSE FALSE
Fuel_TypePetrol FALSE FALSE
TransmissionManual FALSE FALSE
Owner_Type2 FALSE FALSE
Owner_Type3 FALSE FALSE
Mileage FALSE FALSE
Engine FALSE FALSE
Power FALSE FALSE
Seats FALSE FALSE
1 subsets of each size up to 8
Selection Algorithm: exhaustive
BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti BrandMercedes BrandTata BrandToyota BrandVolkswagen Age Kilometers_Driven
1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " "
4 ( 1 ) "*" " " " " " " " " " " "*" " " " " " " " " " "
5 ( 1 ) "*" " " " " " " " " " " "*" " " " " " " " " " "
6 ( 1 ) "*" " " " " "*" " " " " "*" " " " " " " " " " "
7 ( 1 ) "*" " " " " "*" " " " " "*" " " " " " " "*" " "
8 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" " " "*" " " " "
Fuel_TypePetrol TransmissionManual Owner_Type2 Owner_Type3 Mileage Engine Power Seats
1 ( 1 ) " " " " " " " " " " " " "*" " "
2 ( 1 ) " " "*" " " " " " " " " "*" " "
3 ( 1 ) " " "*" " " " " " " " " "*" " "
4 ( 1 ) " " "*" " " " " " " " " "*" " "
5 ( 1 ) " " "*" " " " " " " " " "*" "*"
6 ( 1 ) " " "*" " " " " " " " " "*" "*"
7 ( 1 ) " " "*" " " " " " " " " "*" "*"
8 ( 1 ) " " " " " " " " " " " " "*" " "
# Plot the best-subset results three times, ranking the models by
# R-squared, adjusted R-squared and BIC in turn.
invisible(lapply(
  c("r2", "adjr2", "bic"),
  function(criterion) plot(Bestfits, scale = criterion)
))
# AIC model: the formula reported by the AIC stepwise search.
model1 <- lm(
  Price ~ Power + Brand + Fuel_Type + Mileage + Transmission +
    Kilometers_Driven,
  data = traincars
)
model1
Call:
lm(formula = Price ~ Power + Brand + Fuel_Type + Mileage + Transmission +
Kilometers_Driven, data = traincars)
Coefficients:
(Intercept) Power BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
2.477e+06 5.657e+03 4.238e+04 -9.910e+05 -8.947e+05 -1.166e+06 -1.194e+06 -9.918e+05
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual Kilometers_Driven
2.926e+03 -8.977e+05 -7.333e+05 -8.508e+05 -2.632e+05 -4.236e+04 -2.207e+05 -6.367e+00
# Diagnostic plots (2 x 2 grid) for the AIC-selected model.
# Power is part of the AIC-selected formula, so it must appear here;
# without it the diagnostics would describe a different model.
par(mfrow = c(2, 2))
AIC_lm <- lm(
  Price ~ Power + Brand + Fuel_Type + Mileage + Transmission +
    Kilometers_Driven,
  data = traincars
)
plot(AIC_lm)
# BIC model: the formula reported by the BIC stepwise search.
model2 <- lm(
  Price ~ Power + Brand + Fuel_Type + Mileage + Transmission,
  data = traincars
)
model2
Call:
lm(formula = Price ~ Power + Brand + Fuel_Type + Mileage + Transmission,
data = traincars)
Coefficients:
(Intercept) Power BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
2134420 5715 83466 -969348 -935963 -1204552 -1167995 -965189
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual
33754 -901201 -755428 -845437 -222150 -35033 -223043
# Diagnostic plots (2 x 2 grid) for a fit with the BIC model's predictors.
par(mfrow = c(2, 2))
BIC_lm <- lm(
  Price ~ Brand + Fuel_Type + Mileage + Transmission + Power,
  data = traincars
)
plot(BIC_lm)
# Subset model: Brand, Transmission, Power and Seats.
model3 <- lm(
  Price ~ Brand + Transmission + Power + Seats,
  data = traincars
)
model3
Call:
lm(formula = Price ~ Brand + Transmission + Power + Seats, data = traincars)
Coefficients:
(Intercept) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti BrandMercedes
475735 87185 -877747 -1033760 -1175541 -1022604 -1020954 95158
BrandTata BrandToyota BrandVolkswagen TransmissionManual Power Seats
-935651 -764668 -844961 -288212 6374 160413
# Diagnostic plots (2 x 2 grid) for a fit with the subset model's formula.
par(mfrow = c(2, 2))
Sub_lm <- lm(
  Price ~ Brand + Transmission + Power + Seats,
  data = traincars
)
plot(Sub_lm)
# AIC model with Power log-transformed (Price left on its original scale).
model1_1 <- lm(
  Price ~ log(Power) + Brand + Fuel_Type + Mileage + Transmission +
    Kilometers_Driven,
  data = traincars
)
model1_1
Call:
lm(formula = Price ~ log(Power) + Brand + Fuel_Type + Mileage +
Transmission + Kilometers_Driven, data = traincars)
Coefficients:
(Intercept) log(Power) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
-2.749e+06 1.209e+06 2.706e+04 -8.947e+05 -7.294e+05 -1.016e+06 -1.110e+06 -8.591e+05
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual Kilometers_Driven
2.318e+04 -7.739e+05 -6.981e+05 -7.343e+05 -1.802e+05 -3.697e+04 -1.472e+05 -1.118e+01
# BIC model with Power log-transformed (Price left on its original scale).
model2_1 <- lm(
  Price ~ log(Power) + Brand + Fuel_Type + Mileage + Transmission,
  data = traincars
)
model2_1
Call:
lm(formula = Price ~ log(Power) + Brand + Fuel_Type + Mileage +
Transmission, data = traincars)
Coefficients:
(Intercept) log(Power) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
-3199388 1191453 108769 -852837 -796458 -1085552 -1057885 -817465
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual
80254 -780618 -739189 -734490 -106670 -26502 -166047
# Subset model with Power log-transformed (Price left on its original scale).
model3_1 <- lm(
  Price ~ Brand + Transmission + log(Power) + Seats,
  data = traincars
)
model3_1
Call:
lm(formula = Price ~ Brand + Transmission + log(Power) + Seats,
data = traincars)
Coefficients:
(Intercept) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti BrandMercedes
-4976783 103876 -779089 -870159 -1054686 -989824 -821630 126517
BrandTata BrandToyota BrandVolkswagen TransmissionManual log(Power) Seats
-783899 -748247 -697566 -174881 1331982 104265
# AIC model with both Price and Power log-transformed.
# The response of this fit is log(Price), not Price.
model1_2 <- lm(
  log(Price) ~ log(Power) + Brand + Fuel_Type + Mileage + Transmission +
    Kilometers_Driven,
  data = traincars
)
model1_2
Call:
lm(formula = log(Price) ~ log(Power) + Brand + Fuel_Type + Mileage +
Transmission + Kilometers_Driven, data = traincars)
Coefficients:
(Intercept) log(Power) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
1.164e+01 7.248e-01 8.534e-04 -4.748e-01 -3.892e-01 -6.584e-01 -6.853e-01 -4.646e-01
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual Kilometers_Driven
-3.923e-02 -4.384e-01 -3.386e-01 -3.862e-01 -2.197e-01 -3.302e-02 -2.012e-01 -2.882e-06
# BIC model with both Price and Power log-transformed.
# The response of this fit is log(Price), not Price.
model2_2 <- lm(
  log(Price) ~ log(Power) + Brand + Fuel_Type + Mileage + Transmission,
  data = traincars
)
model2_2
Call:
lm(formula = log(Price) ~ log(Power) + Brand + Fuel_Type + Mileage +
Transmission, data = traincars)
Coefficients:
(Intercept) log(Power) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti
11.52560 0.72024 0.02191 -0.46404 -0.40652 -0.67650 -0.67184 -0.45387
BrandMercedes BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual
-0.02452 -0.44012 -0.34923 -0.38629 -0.20070 -0.03032 -0.20603
# Subset model with both Price and Power log-transformed.
# The response of this fit is log(Price), not Price.
model3_2 <- lm(
  log(Price) ~ Brand + Transmission + log(Power) + Seats,
  data = traincars
)
model3_2
Call:
lm(formula = log(Price) ~ Brand + Transmission + log(Power) +
Seats, data = traincars)
Coefficients:
(Intercept) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti BrandMercedes
9.533864 0.008435 -0.399509 -0.460418 -0.641015 -0.552593 -0.479700 0.014670
BrandTata BrandToyota BrandVolkswagen TransmissionManual log(Power) Seats
-0.458189 -0.341865 -0.371543 -0.229344 0.876573 0.109526
# Test-set check for the AIC model (model1): actual vs predicted price,
# followed by a histogram of the prediction errors.
predicted_prices <- predict(model1, newdata = testcars)
plot(
  x = testcars$Price,
  y = predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19,
  col = "blue"
)
# Red line with intercept 0 and slope 1 marks perfect prediction.
abline(a = 0, b = 1, col = "red", lwd = 2)
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
# Histogram to judge how close to normal the residuals look.
hist(
  residuals,
  main = "Histogram of AIC (Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Test-set check for the AIC model with logged Power (model1_1): actual vs
# predicted price, followed by a histogram of the prediction errors.
predicted_prices <- predict(model1_1, newdata = testcars)
plot(
  x = testcars$Price,
  y = predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19,
  col = "blue"
)
# Red line with intercept 0 and slope 1 marks perfect prediction.
abline(a = 0, b = 1, col = "red", lwd = 2)
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
hist(
  residuals,
  main = "Histogram of AIC log power (Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Test-set check for the AIC model with logged Price and logged Power
# (model1_2). That model's response is log(Price), so its predictions are
# back-transformed with exp() before being compared with the actual prices;
# otherwise the scatter plot and residuals would mix log-scale predictions
# with original-scale prices.
# NOTE(review): under the usual log-normal error assumption, exp() of a
# log-scale prediction estimates the median price rather than the mean.
predicted_prices <- exp(predict(model1_2, newdata = testcars))
plot(
  testcars$Price, predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19, col = "blue"
)
abline(0, 1, col = "red", lwd = 2) # perfect prediction line
# Residual = actual - predicted, both on the original price scale.
residuals <- testcars$Price - predicted_prices
# Plot histogram
hist(
  residuals,
  main = "Histogram of AIC log price and log power",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Test-set check for the BIC model (model2): actual vs predicted price,
# followed by a histogram of the prediction errors.
predicted_prices <- predict(model2, newdata = testcars)
plot(
  x = testcars$Price,
  y = predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19,
  col = "blue"
)
# Red line with intercept 0 and slope 1 marks perfect prediction.
abline(a = 0, b = 1, col = "red", lwd = 2)
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
hist(
  residuals,
  main = "Histogram of BIC (Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Test-set check for the BIC model with logged Power (model2_1): actual vs
# predicted price, followed by a histogram of the prediction errors.
predicted_prices <- predict(model2_1, newdata = testcars)
plot(
  x = testcars$Price,
  y = predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19,
  col = "blue"
)
# Red line with intercept 0 and slope 1 marks perfect prediction.
abline(a = 0, b = 1, col = "red", lwd = 2)
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
hist(
  residuals,
  main = "Histogram of BIC log power (Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Test-set check for the BIC model with logged Price and logged Power
# (model2_2). That model's response is log(Price), so its predictions are
# back-transformed with exp() before being compared with the actual prices;
# otherwise the scatter plot and residuals would mix log-scale predictions
# with original-scale prices.
# NOTE(review): under the usual log-normal error assumption, exp() of a
# log-scale prediction estimates the median price rather than the mean.
predicted_prices <- exp(predict(model2_2, newdata = testcars))
plot(
  testcars$Price, predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19, col = "blue"
)
abline(0, 1, col = "red", lwd = 2) # perfect prediction line
# Residual = actual - predicted, both on the original price scale.
residuals <- testcars$Price - predicted_prices
# Plot histogram
hist(
  residuals,
  main = "Histogram of BIC log price and log power",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Test-set check for the subset model (model3): actual vs predicted price,
# followed by a histogram of the prediction errors.
predicted_prices <- predict(model3, newdata = testcars)
plot(
  x = testcars$Price,
  y = predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19,
  col = "blue"
)
# Red line with intercept 0 and slope 1 marks perfect prediction.
abline(a = 0, b = 1, col = "red", lwd = 2)
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
hist(
  residuals,
  main = "Histogram of Subset (Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
NA
NA
# Test-set check for the subset model with logged Power (model3_1): actual
# vs predicted price, followed by a histogram of the prediction errors.
predicted_prices <- predict(model3_1, newdata = testcars)
plot(
  x = testcars$Price,
  y = predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19,
  col = "blue"
)
# Red line with intercept 0 and slope 1 marks perfect prediction.
abline(a = 0, b = 1, col = "red", lwd = 2)
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
hist(
  residuals,
  main = "Histogram of subset log power(Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Test-set check for the subset model with logged Price and logged Power
# (model3_2). That model's response is log(Price), so its predictions are
# back-transformed with exp() before being compared with the actual prices;
# otherwise the scatter plot and residuals would mix log-scale predictions
# with original-scale prices.
# NOTE(review): under the usual log-normal error assumption, exp() of a
# log-scale prediction estimates the median price rather than the mean.
predicted_prices <- exp(predict(model3_2, newdata = testcars))
plot(
  testcars$Price, predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19, col = "blue"
)
abline(0, 1, col = "red", lwd = 2) # perfect prediction line
# Residual = actual - predicted, both on the original price scale.
residuals <- testcars$Price - predicted_prices
# Plot histogram
hist(
  residuals,
  main = "Histogram of subset log price and log pwoer",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
NA
NA
NA
# Final decision for the model: AIC with logged Power. Check validity with
# the standard lm diagnostic plots, each set on a 2 x 2 grid. The BIC and
# subset variants with logged Power are plotted as well for comparison.
par(mfrow = c(2, 2))
loggedAIC_lm <- lm(
  Price ~ log(Power) + Brand + Fuel_Type + Mileage + Transmission +
    Kilometers_Driven,
  data = traincars
)
plot(loggedAIC_lm)

par(mfrow = c(2, 2))
loggedBIC_lm <- lm(
  Price ~ log(Power) + Brand + Fuel_Type + Mileage + Transmission,
  data = traincars
)
plot(loggedBIC_lm)

par(mfrow = c(2, 2))
loggedSub_lm <- lm(
  Price ~ Brand + Transmission + log(Power) + Seats,
  data = traincars
)
plot(loggedSub_lm)
### Additional information about multicollinearity
# Variance inflation factors for the AIC model; the printed table reports
# GVIF, Df and GVIF^(1/(2*Df)) per term.
library(car)
car::vif(model1)
GVIF Df GVIF^(1/(2*Df))
Power 3.484163 1 1.866591
Brand 6.596049 10 1.098915
Fuel_Type 1.531354 1 1.237479
Mileage 3.084286 1 1.756214
Transmission 2.670094 1 1.634042
Kilometers_Driven 1.834917 1 1.354591
# Variance inflation factors for the BIC model.
car::vif(model2)
GVIF Df GVIF^(1/(2*Df))
Power 3.469040 1 1.862536
Brand 4.861174 10 1.082274
Fuel_Type 1.253487 1 1.119592
Mileage 2.684819 1 1.638542
Transmission 2.669180 1 1.633762
# Variance inflation factors for the subset model.
car::vif(model3)
GVIF Df GVIF^(1/(2*Df))
Brand 3.664279 10 1.067086
Transmission 2.499584 1 1.581007
Power 2.276120 1 1.508682
Seats 1.364545 1 1.168137
### Considering the cost with all the reasonable predictors:
### Brand, Fuel_Type, Mileage, Transmission, Power, Kilometers_Driven, Seats.
lss_lm <- lm(
  Price ~ Brand + Fuel_Type + Mileage + Transmission + Power +
    Kilometers_Driven + Seats,
  data = traincars
)
lss_lm
Call:
lm(formula = Price ~ Brand + Fuel_Type + Mileage + Transmission +
Power + Kilometers_Driven + Seats, data = traincars)
Coefficients:
(Intercept) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti BrandMercedes
1.901e+06 5.948e+04 -9.473e+05 -9.289e+05 -1.147e+06 -1.145e+06 -9.899e+05 3.322e+04
BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual Power Kilometers_Driven
-8.923e+05 -7.572e+05 -8.296e+05 -2.201e+05 -3.359e+04 -2.380e+05 5.854e+03 -5.664e+00
Seats
6.552e+04
# Test-set check for lss_lm: actual vs predicted price, followed by a
# histogram of the prediction errors.
predicted_prices <- predict(lss_lm, newdata = testcars)
plot(
  testcars$Price, predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19, col = "blue"
)
abline(0, 1, col = "red", lwd = 2) # perfect prediction line
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
# Plot histogram. lss_lm uses untransformed Power, so the title must not say
# "log power"; that label belongs to the lss_lm_1 histogram.
hist(
  residuals,
  main = "Histogram of lss (Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Same predictor set as lss_lm, but with Power log-transformed.
lss_lm_1 <- lm(
  Price ~ Brand + Fuel_Type + Mileage + Transmission + log(Power) +
    Kilometers_Driven + Seats,
  data = traincars
)
lss_lm_1
Call:
lm(formula = Price ~ Brand + Fuel_Type + Mileage + Transmission +
log(Power) + Kilometers_Driven + Seats, data = traincars)
Coefficients:
(Intercept) BrandBMW BrandFord BrandHonda BrandHyundai BrandMahindra BrandMaruti BrandMercedes
-3.350e+06 4.111e+04 -8.563e+05 -7.524e+05 -9.958e+05 -1.067e+06 -8.543e+05 4.853e+04
BrandTata BrandToyota BrandVolkswagen Fuel_TypePetrol Mileage TransmissionManual log(Power) Kilometers_Driven
-7.661e+05 -7.167e+05 -7.145e+05 -1.429e+05 -2.984e+04 -1.602e+05 1.241e+06 -1.074e+01
Seats
5.325e+04
# Test-set check for lss_lm_1 (logged Power): actual vs predicted price,
# followed by a histogram of the prediction errors.
predicted_prices <- predict(lss_lm_1, newdata = testcars)
plot(
  x = testcars$Price,
  y = predicted_prices,
  xlab = "Actual Price",
  ylab = "Predicted Price",
  main = "Actual vs Predicted Prices",
  pch = 19,
  col = "blue"
)
# Red line with intercept 0 and slope 1 marks perfect prediction.
abline(a = 0, b = 1, col = "red", lwd = 2)
# Residual = actual - predicted, reusing the predictions computed above.
residuals <- testcars$Price - predicted_prices
hist(
  residuals,
  main = "Histogram of lss log power (Actual - Predicted Price)",
  xlab = "Residual (Price - Predicted)",
  col = "skyblue",
  border = "white",
  breaks = 10
)
# Fit an ordinary linear regression on the chosen variables and show its
# diagnostic plots on a 2 x 2 grid.
# NOTE(review): the original comment says these variables were chosen by
# lasso; no lasso fit is visible in this section, so confirm where that
# selection was made.
par(mfrow = c(2, 2))
lss_lm <- lm(
  Price ~ Brand + Fuel_Type + Mileage + Transmission + Power +
    Kilometers_Driven + Seats,
  data = traincars
)
plot(lss_lm)